options(repos = "http://cran.us.r-project.org")
#import and load the required packages
#install.packages("tidyverse")
#install.packages("plotly")
library(tidyverse)
## ── Attaching core tidyverse packages ─────────────────────────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
if (!require(ggpubr)) {
install.packages("ggpubr")
}
## Loading required package: ggpubr
if (!require(moments)) {
install.packages("moments")
}
## Loading required package: moments
library(ggpubr)
gapminder <- read_csv("gapminder_clean.csv")
## New names:
## Rows: 2607 Columns: 20
## ── Column specification
## ─────────────────────────────────────────────────────────────────────────── Delimiter: "," chr
## (2): Country Name, continent dbl (18): ...1, Year, Agriculture, value added (% of GDP), CO2
## emissions (me...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ Specify the column types
## or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
gapminder
## # A tibble: 2,607 × 20
## ...1 Country…¹ Year Agric…² CO2 e…³ Domes…⁴ Elect…⁵ Energ…⁶ Expor…⁷ Ferti…⁸
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 Afghanis… 1962 NA 0.0738 21.3 NA NA 4.88 7.45
## 2 1 Afghanis… 1967 NA 0.124 9.92 NA NA 6.77 7.45
## 3 2 Afghanis… 1972 NA 0.131 18.9 NA NA 14.8 7.45
## 4 3 Afghanis… 1977 NA 0.183 13.8 NA NA 11.7 7.45
## 5 4 Afghanis… 1982 NA 0.166 NA NA NA NA 7.45
## 6 5 Afghanis… 1987 NA 0.276 NA NA NA NA 7.46
## 7 6 Afghanis… 1992 NA 0.101 NA NA NA NA 7.50
## 8 7 Afghanis… 1997 NA 0.0608 NA NA NA NA 7.64
## 9 8 Afghanis… 2002 38.5 0.0411 NA NA NA 32.4 7.27
## 10 9 Afghanis… 2007 30.6 0.0879 0.535 NA NA 17.8 6.44
## # … with 2,597 more rows, 10 more variables: `GDP growth (annual %)` <dbl>,
## # `Imports of goods and services (% of GDP)` <dbl>,
## # `Industry, value added (% of GDP)` <dbl>,
## # `Inflation, GDP deflator (annual %)` <dbl>,
## # `Life expectancy at birth, total (years)` <dbl>,
## # `Population density (people per sq. km of land area)` <dbl>,
## # `Services, etc., value added (% of GDP)` <dbl>, pop <dbl>, …
gapminder_filtered <- filter(gapminder, Year == 1962)
gapminder_filtered
## # A tibble: 259 × 20
## ...1 Country…¹ Year Agric…² CO2 e…³ Domes…⁴ Elect…⁵ Energ…⁶ Expor…⁷ Ferti…⁸
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 Afghanis… 1962 NA 0.0738 21.3 NA NA 4.88 7.45
## 2 10 Albania 1962 NA 1.44 NA NA NA NA 6.28
## 3 20 Algeria 1962 NA 0.485 NA NA NA 19.8 7.61
## 4 30 American… 1962 NA NA NA NA NA NA NA
## 5 40 Andorra 1962 NA NA NA NA NA NA NA
## 6 50 Angola 1962 NA 0.216 NA NA NA NA 7.40
## 7 60 Antigua … 1962 NA 1.82 NA NA NA NA 4.34
## 8 70 Arab Wor… 1962 NA 0.761 18.2 NA NA NA 6.96
## 9 80 Argentina 1962 NA 2.52 17.3 NA NA 4.69 3.09
## 10 90 Armenia 1962 NA NA NA NA NA NA 4.43
## # … with 249 more rows, 10 more variables: `GDP growth (annual %)` <dbl>,
## # `Imports of goods and services (% of GDP)` <dbl>,
## # `Industry, value added (% of GDP)` <dbl>,
## # `Inflation, GDP deflator (annual %)` <dbl>,
## # `Life expectancy at birth, total (years)` <dbl>,
## # `Population density (people per sq. km of land area)` <dbl>,
## # `Services, etc., value added (% of GDP)` <dbl>, pop <dbl>, …
gapminder_plot <- ggplot(gapminder_filtered, aes(x = gdpPercap,
y = `CO2 emissions (metric tons per capita)`)) +
geom_point()
ggsave("gapminderplot.png")
## Saving 7 x 5 in image
## Warning: Removed 151 rows containing missing values (`geom_point()`).
gapminder_plot
## Warning: Removed 151 rows containing missing values (`geom_point()`).
correlation <- cor(gapminder_filtered$`CO2 emissions (metric tons per capita)`,
gapminder_filtered$gdpPercap, use = "complete.obs")
p_value <- cor.test(gapminder_filtered$`CO2 emissions (metric tons per capita)`,
gapminder_filtered$gdpPercap)$p.value
correlation
## [1] 0.9260817
p_value
## [1] 1.128679e-46
#cor_by_year <- gapminder |>
# group_by(Year) |>
# summarise(cor(gapminder_filtered$`CO2 emissions (metric tons per capita)`,
# gapminder_filtered$gdpPercap, use = "complete.obs"))
# strongest_year_row <- filter(cor_by_year, correlation == max(correlation))
# strongest_year_row
# strongest_year <- strongest_year_row$Year
# strongest_year
gapminder_filtered <- gapminder %>%
filter(!is.na(`CO2 emissions (metric tons per capita)`), !is.na(gdpPercap))
cor_by_year <- gapminder_filtered %>%
group_by(Year) %>%
summarise(correlation = cor(`CO2 emissions (metric tons per capita)`, gdpPercap, use = "complete.obs"))
cor_by_year
## # A tibble: 10 × 2
## Year correlation
## <dbl> <dbl>
## 1 1962 0.926
## 2 1967 0.939
## 3 1972 0.843
## 4 1977 0.793
## 5 1982 0.817
## 6 1987 0.810
## 7 1992 0.809
## 8 1997 0.808
## 9 2002 0.801
## 10 2007 0.720
strongest_year_row <- cor_by_year %>%
filter(correlation == max(correlation)) %>%
slice(1)
strongest_year <- strongest_year_row
strongest_year
## # A tibble: 1 × 2
## Year correlation
## <dbl> <dbl>
## 1 1967 0.939
gapminder_strongest_year <- gapminder %>%
filter(Year == 1967)
gapminder_strongest_year
## # A tibble: 259 × 20
## ...1 Country…¹ Year Agric…² CO2 e…³ Domes…⁴ Elect…⁵ Energ…⁶ Expor…⁷ Ferti…⁸
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 Afghanis… 1967 NA 0.124 9.92 NA NA 6.77 7.45
## 2 11 Albania 1967 NA 1.36 NA NA NA NA 5.39
## 3 21 Algeria 1967 10.3 0.632 28.0 NA NA 23.4 7.67
## 4 31 American… 1967 NA NA NA NA NA NA NA
## 5 41 Andorra 1967 NA NA NA NA NA NA NA
## 6 51 Angola 1967 NA 0.167 NA NA NA NA 7.40
## 7 61 Antigua … 1967 NA 9.11 NA NA NA NA 4.04
## 8 71 Arab Wor… 1967 NA 1.33 31.3 NA NA NA 6.93
## 9 81 Argentina 1967 9.98 2.86 18.7 NA NA 7.50 3.05
## 10 91 Armenia 1967 NA NA NA NA NA NA 3.61
## # … with 249 more rows, 10 more variables: `GDP growth (annual %)` <dbl>,
## # `Imports of goods and services (% of GDP)` <dbl>,
## # `Industry, value added (% of GDP)` <dbl>,
## # `Inflation, GDP deflator (annual %)` <dbl>,
## # `Life expectancy at birth, total (years)` <dbl>,
## # `Population density (people per sq. km of land area)` <dbl>,
## # `Services, etc., value added (% of GDP)` <dbl>, pop <dbl>, …
strongest_year_plot <- ggplot(gapminder_strongest_year,
aes(x = gdpPercap, y = `CO2 emissions (metric tons per capita)`,
color = continent,
size = pop)) +
geom_point()
ggplotly(strongest_year_plot)
We want to test if there is a statistically significant difference in the Energy use per continent. Thus, continent is the predictor variable, and energy use is the outcome variable.
To use a parametric test, we must ensure that three assumptions are met: Normality, equal variances, and independence.
Normality assumption: To test for normality, we conduct a Shapiro-Wilk Test for Normality.
#Conduct Shapiro-Wilk Test for normality
shapiro.test(gapminder$"Energy use (kg of oil equivalent per capita)")
##
## Shapiro-Wilk normality test
##
## data: gapminder$"Energy use (kg of oil equivalent per capita)"
## W = 0.62442, p-value < 2.2e-16
As we can see, the p value of the test is < 2.2e-16, which is less than the alpha level of 0.05. This suggests that the samples do not come from a normal distribution.
Therefore, we conduct a Kruskal-Wallis H test
kruskal.test(`Energy use (kg of oil equivalent per capita)` ~ continent, data = gapminder)
##
## Kruskal-Wallis rank sum test
##
## data: Energy use (kg of oil equivalent per capita) by continent
## Kruskal-Wallis chi-squared = 318.68, df = 4, p-value < 2.2e-16
Thus can reject the null hypothesis that there is no difference in the mean ranks of energy use across continents.
ggplot(gapminder, aes(x = `Energy use (kg of oil equivalent per capita)`)) +
geom_histogram(bins = 30) +
facet_wrap(~continent) +
labs(title = "Histogram of energy use per capita by continent",
x = "Energy use (kg of oil equivalent per capita)",
y = "Frequency")
## Warning: Removed 1197 rows containing non-finite values (`stat_bin()`).
#convert ggplot2 plot to plotly plot
ggplotly()
## Warning: Removed 1197 rows containing non-finite values (`stat_bin()`).
Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990? (stats test needed)
gapminder_years <- gapminder %>%
filter(Year > 1990)
# ggqqplot(data = gapminder_years, x = 'Imports of goods and services (% of GDP)', facet.by = 'continent')
# Filter data for the years after 1990
data <- gapminder %>%
filter(Year > 1990) %>%
select(continent, Year, `Imports of goods and services (% of GDP)`)
# Rename the variable for convenience
data <- data %>%
rename(gdp = `Imports of goods and services (% of GDP)`)
# Plot Q-Q plot with facet by continent
ggqqplot(data = data, x = "gdp", facet.by = "continent")
## Warning: Removed 152 rows containing non-finite values (`stat_qq()`).
## Warning: Removed 152 rows containing non-finite values (`stat_qq_line()`).
## Removed 152 rows containing non-finite values (`stat_qq_line()`).
### Shapiro-Wilk Test for Normality
europe <- gapminder_years %>%
filter(continent == "Europe")
asia <- gapminder_years %>%
filter(continent == "Asia")
skewness(europe$'Imports of goods and services (% of GDP)', na.rm = TRUE)
## [1] 0.8351207
kurtosis(europe$'Imports of goods and services (% of GDP)', na.rm = TRUE)
## [1] 2.945533
shapiro.test(europe$'Imports of goods and services (% of GDP)'[complete.cases(data)])
##
## Shapiro-Wilk normality test
##
## data: europe$"Imports of goods and services (% of GDP)"[complete.cases(data)]
## W = 0.94204, p-value = 0.0124
skewness(asia$'Imports of goods and services (% of GDP)', na.rm = TRUE)
## [1] 1.770528
kurtosis(asia$'Imports of goods and services (% of GDP)', na.rm = TRUE)
## [1] 7.311968
shapiro.test(asia$'Imports of goods and services (% of GDP)'[complete.cases(data)])
##
## Shapiro-Wilk normality test
##
## data: asia$"Imports of goods and services (% of GDP)"[complete.cases(data)]
## W = 0.83268, p-value = 9.337e-06
As we can see, the p-values are less than 0.05. Therefore, we can reject the null hypothesis that the data is normally distributed, and we can apply a log transform.